### specification test

order_tmp<-order(results_df$pid, decreasing=F)
results_df<-results_df[order_tmp,]

## compute semeter level avgs
results_df$sem_data_s_1<-rowMeans(results_df[,paste0("s", 1:4)], na.rm=T)
results_df$sem_data_s_2<-rowMeans(results_df[,paste0("s", 5:8)], na.rm=T)

df_wide<-subset(results_df, select=c("pid", "mu_s", "gpa1", "gpa2","sem_data_s_1", "sem_data_s_2", "sem1_s_model", "sem2_s_model", "sem1_y_model", "sem2_y_model" ))

df_wide$sem_model_s_1<-df_wide$sem1_s_model
df_wide$sem_model_s_2<-df_wide$sem2_s_model
df_wide$sem_model_y_1<-df_wide$sem1_y_model
df_wide$sem_model_y_2<-df_wide$sem2_y_model
df_wide$sem_data_y_1<-df_wide$gpa1
df_wide$sem_data_y_2<-df_wide$gpa2

df_wide<-subset(df_wide, select=c("pid","mu_s", "sem_data_s_1", "sem_data_s_2", "sem_model_s_1", "sem_model_s_2","sem_data_y_1", "sem_data_y_2", "sem_model_y_1", "sem_model_y_2" ))

pid_no_missing<-apply(df_wide, 1, function(x) max(is.na(x))==0)

df_wide_small<-df_wide[pid_no_missing,]
A_arrays_small<-A_array_months[pid_no_missing, pid_no_missing,]

## reshape from wide to long
df_long_small<-reshape(df_wide_small, varying=c("sem_data_s_1", "sem_data_s_2", "sem_model_s_1", "sem_model_s_2","sem_data_y_1", "sem_data_y_2", "sem_model_y_1", "sem_model_y_2"), v.names = c("sem_data_s", "sem_model_s", "sem_data_y", "sem_model_y"), direction="long", idvar="pid", timevar = "sem")

order_tmp<-order(df_long_small$sem, df_long_small$pid, decreasing=F)
df_long_small<-df_long_small[order_tmp,]

reg_s_1<-lm(sem_data_s ~ sem_model_s, data=df_long_small)
study_resid<-with(df_long_small, sem_data_s - sem_model_s)
n_tmp<-dim(df_wide_small)[1]
study_resid_1<-study_resid[1:n_tmp]
study_resid_2<-study_resid[(n_tmp+1):(n_tmp*2)]

avg_resid<-data.frame(avg_resid=tapply(study_resid, df_long_small$pid, mean))
avg_resid$pid<-row.names(avg_resid)

df_long_small<-merge(df_long_small, avg_resid, by="pid")

order_tmp<-order(df_long_small$sem, df_long_small$pid, decreasing=F)
df_long_small<-df_long_small[order_tmp,]

reg_s_2<-lm(sem_data_s ~ sem_model_s + avg_resid, data=df_long_small)

## check correlation between own and friend resids
## first semester friends
df_tmp<-subset(df_long_small, sem==1)

	total_mu_s_not_i_1<-A_arrays_small[,,1] %*% as.array(df_tmp$mu_s)
	total_friends_1<-A_arrays_small[,,1] %*% rep(1, length(df_tmp$mu_s))
	mu_s_not_i_1<-total_mu_s_not_i_1 / total_friends_1
	cor_results_mu_1<-(cor.test(df_tmp$mu_s, mu_s_not_i_1, use="complete"))

	total_resid_not_i_1<-A_arrays_small[,,1] %*% as.array(df_tmp$avg_resid)
	avg_resid_not_i_1<-total_resid_not_i_1 / total_friends_1
	cor_results_rho_1<-(cor.test(df_tmp$avg_resid, avg_resid_not_i_1, use="complete"))

## contemporaneous resid
	total_sem_1_resid_not_i_1<-A_arrays_small[,,1] %*% as.array(study_resid_1)
	avg_sem_1_resid_not_i_1<-total_sem_1_resid_not_i_1 / total_friends_1
	cor_results_rho_1_contemp<-(cor.test(study_resid_1, avg_sem_1_resid_not_i_1, use="complete"))

df_tmp<-subset(df_long_small, sem==2)

	total_mu_s_not_i_2<-A_arrays_small[,,2] %*% as.array(df_tmp$mu_s)
	total_friends_2<-A_arrays_small[,,2] %*% rep(1, length(df_tmp$mu_s))
	mu_s_not_i_2<-total_mu_s_not_i_2 / total_friends_2
	cor_results_mu_2<-(cor.test(df_tmp$mu_s, mu_s_not_i_2, use="complete"))

	total_resid_not_i_2<-A_arrays_small[,,2] %*% as.array(df_tmp$avg_resid)
	avg_resid_not_i_2<-total_resid_not_i_2 / total_friends_2
	cor_results_rho_2<-(cor.test(df_tmp$avg_resid, avg_resid_not_i_2, use="complete"))

## contemporaneous resid
	total_sem_2_resid_not_i_2<-A_arrays_small[,,2] %*% as.array(study_resid_2)
	avg_sem_2_resid_not_i_2<-total_sem_2_resid_not_i_2 / total_friends_2
	cor_results_rho_2_contemp<-(cor.test(study_resid_2, avg_sem_2_resid_not_i_2, use="complete"))

## stacked residuals
stack_avg_resid_not_i<-c(avg_resid_not_i_1, avg_resid_not_i_2)
stack_avg_resid_not_i_contemp<-c(avg_sem_1_resid_not_i_1, avg_sem_2_resid_not_i_2)

df_long_small<-cbind(df_long_small, stack_avg_resid_not_i, stack_avg_resid_not_i_contemp)

cor_results_mu_stacked<-(cor.test(df_long_small$mu_s, c(mu_s_not_i_1,mu_s_not_i_2), use="complete"))

cor_results_rho_stacked<-(cor.test(rep(avg_resid$avg_resid,2), stack_avg_resid_not_i, use="complete"))
cor_results_rho_stacked_contemp<-(cor.test(study_resid, stack_avg_resid_not_i_contemp, use="complete"))

	cor_results_table<-vector(length=4)
## mu
	nn<-cor_results_mu_1
	tmp_sem_1<-c("mu", nn$estimate, nn$statistic, nn$p.value)
	cor_results_table<-rbind(cor_results_table, tmp_sem_1)

	nn<-cor_results_mu_2
	tmp_sem_2<-c("mu", nn$estimate, nn$statistic, nn$p.value)
	cor_results_table<-	rbind(cor_results_table, tmp_sem_2)

	nn<-cor_results_mu_stacked
	tmp_stacked<-c("mu", nn$estimate, nn$statistic, nn$p.value)
	cor_results_table<-rbind(cor_results_table, tmp_stacked)

## rho
	nn<-cor_results_rho_1
	tmp_sem_1<-c("rho", nn$estimate, nn$statistic, nn$p.value)
	cor_results_table<-rbind(cor_results_table, tmp_sem_1)

	nn<-cor_results_rho_2
	tmp_sem_2<-c("rho", nn$estimate, nn$statistic, nn$p.value)
	cor_results_table<-	rbind(cor_results_table, tmp_sem_2)

	nn<-cor_results_rho_stacked
	tmp_stacked<-c("rho", nn$estimate, nn$statistic, nn$p.value)
	cor_results_table<-rbind(cor_results_table, tmp_stacked)

	cor_results_table<-cor_results_table[-1,]
	colnames(cor_results_table)[4]<-"pval"

	
stacked_contemp_own_resid<-c(study_resid_1, study_resid_2)
stacked_contemp_not_i_resid<-c(avg_sem_1_resid_not_i_1, avg_sem_2_resid_not_i_2)

## to cluster adjust
## first use contemporaneous residuals
tmp_df<-data.frame(tmp_id=rep(1:length(study_resid_1),2), own_resid=stacked_contemp_own_resid, friend_resid=stacked_contemp_not_i_resid)
tmp_df$own_resid<-tmp_df$own_resid / sd(tmp_df$own_resid, na.rm=T)
tmp_df$friend_resid<-tmp_df$friend_resid / sd(tmp_df$friend_resid, na.rm=T)
tmp_df_no_missing<- tmp_df[!is.na(tmp_df$friend_resid),]
cl_reg_contemp<-lm(own_resid ~ friend_resid, data=tmp_df_no_missing)

cl   <- function(dat,fm, cluster){
  require(sandwich, quietly = TRUE)
  require(lmtest, quietly = TRUE)
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  coeftest(fm, vcovCL) }

contemp_clustered_results<-cl(tmp_df_no_missing, cl_reg_contemp, tmp_df_no_missing$tmp_id)

## to cluster adjust
## second, use avg residuals
tmp_df<-data.frame(tmp_id=rep(1:length(study_resid_1),2), own_resid=rep(avg_resid$avg_resid,2), friend_resid=stack_avg_resid_not_i)
tmp_df$own_resid<-tmp_df$own_resid / sd(tmp_df$own_resid, na.rm=T)
tmp_df$friend_resid<-tmp_df$friend_resid / sd(tmp_df$friend_resid, na.rm=T)
tmp_df_no_missing<- tmp_df[!is.na(tmp_df$friend_resid),]
cl_reg_avg<-lm(own_resid ~ friend_resid, data=tmp_df_no_missing)

cl   <- function(dat,fm, cluster){
  require(sandwich, quietly = TRUE)
  require(lmtest, quietly = TRUE)
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  coeftest(fm, vcovCL) }

avg_clustered_results<-cl(tmp_df_no_missing, cl_reg_avg, tmp_df_no_missing$tmp_id)